Install/load libraries

#install.packages('tidycensus')
#install.packages('position_dodge')
#library(position_dodge)
library(scales)
library(tidycensus)
library(tidyverse)
library(data.table)
library(readxl)
#install.packages('maps')
library(maps)
library(stargazer)
#install.packages('R.utils')
library(ggrepel)
library(R.utils)
library(ggpubr)
#library(arm)
#install.packages("coefplot")
library(coefplot)
#library(dplyr)
library(plotly)

#library(ggcoefstats)


options(tigris_use_cache = TRUE)

Load cleaned redistricting data

This is the dataset which is the output of the data cleaning script.

#redist <- read.csv("./source_data/analysis_data.csv", stringsAsFactors = FALSE)

Data Cleaning and Merging

We worked with three separate data sets for our research project, which needed to be organized and merged to state-election year pairings for our analysis. Single district states were left in the data to be included for certain visualizations, but they would ultimately be dropped for most of the substantive analysis.

Redistricting Data

This data set began as state-redistricting map pairings, tied to the original census year that maps were based on. A significant complicating factor, however, is that several states have map revisions as a result of court challenges, creating inconsistent numbers of pairings. Fortunately, the data also has columns denoting the years a given map was active, which could be used to associate an individual map with the elections where it was active. Since this data frame would be merged with both of our other data sets, this is also also the step where we added dichotomous dummy variables for use in later analysis.

This was merged in several steps: 1. Filtering out state house and state senate maps, as well as the 2020 congressional maps 2. Filling in blank entries in the “Drawn.by” column with “Institution” 3. Identifying states that have revised maps (specifically FL, NC, OH, PA, RI, TX, VA) 4. Creating an additional data frame for pairing census cycles to election years 5. Doing a full_join on the data frames to create all possible pairings 6. Adding a dummy variable for November 1st of each year as a cutoff point to mark election dates 7. Using the above to create another dummy variable to mark election year-map pairings where a map was inactive and filter them out 8. Adding a dummy variable for which states used more than one redistricting method in the sample window 9. Adding a variety of dichotomous dummy variables to identify different redistricting methods

redistricting_raw <- read.csv("./source_data/StatesAndCyclesData.csv", stringsAsFactors = FALSE)

# 1. Filtering out state house and state senate maps, as well as the 2020 congressional maps
# 2. Filling in blank entries in the "Drawn.by" column with "Institution"
redistricting_raw <- redistricting_raw %>%
  filter(Level == "Congress", Cycle.Year < 2020) %>%
  mutate(Drawn.by = ifelse(Drawn.by == "", Institution, Drawn.by))

# 3. Identifying states that have revised maps (FL, NC, OH, PA, RI, TX, VA)
problem_states <- redistricting_raw %>%
  count(State) %>%
  filter(n > 2) %>%
  select(State) %>%
  pull(State)

# 4. Creating an additional data frame for pairing census cycles to election years
cycle_pairs <- rbind(
  data.frame(Cycle.Year = 2000, Year = c(2002, 2004, 2006, 2008, 2010)),
  data.frame(Cycle.Year = 2010, Year = c(2012, 2014, 2016, 2018, 2020)))

# 5. Doing a full_join on the data frames to create all possible pairings
# 6. Adding a dummy variable for November 1st of each year as a cutoff point to mark election dates
redistricting_joined <- full_join(redistricting_raw, cycle_pairs, by = "Cycle.Year") %>%
  mutate(Election.Cutoff = paste(Year, "-11-01", sep = ""),
         Start.Date = ifelse((Start.Date == "") & (Seats == 1), "2001-12-31", Start.Date),
         End.Date = ifelse((End.Date == "") & (Seats == 1), "2001-12-31", End.Date))

# 7. Using the above to create another dummy variable to mark election year-map pairings where a map was inactive and filter them out
redistricting <- redistricting_joined %>%
  mutate(Inactive = ifelse(!(State %in% problem_states), F,
                    ifelse(difftime(redistricting_joined$Start.Date, Election.Cutoff) > 0 |
                           difftime(redistricting_joined$End.Date, Election.Cutoff) < 0, T, F))) %>%
  filter(Inactive == FALSE)

# 8. Adding a dummy variable for which states used more than one redistricting method in the sample window
redistricting_changes = redistricting %>%
  group_by(State) %>%
  summarize(redistricting_methods = n_distinct(Drawn.by)) %>%
  mutate(changed = ifelse(redistricting_methods > 1, 1, 0))

redistricting <- left_join(redistricting, redistricting_changes)

# Example showing the successful isolation of map revisions and accurate pairings
redistricting %>%
  filter(State %in% problem_states) %>%
  select(State, Drawn.by, Year, Start.Date, End.Date, changed)
# 9. Adding a variety of dichotomous dummy variables to identify different redistricting methods for later analysis
redistricting <- redistricting %>%
  mutate(legislature = ifelse(Drawn.by == "Legislature", 1, 0),
         partisan = ifelse((Drawn.by == "Legislature") |
                           (Drawn.by == "Politician commission") , 1, 0),
         independent_commission = ifelse(Drawn.by == "Independent commission", 1, 0),
         court = ifelse(str_detect(tolower(Drawn.by), "court"), 1, 0),
         mid_term = ifelse(Year %% 4 == 2, 1, 0))

redistricting %>%
  select(Year, State, Drawn.by, changed, legislature, partisan,
         independent_commission, court, mid_term)

Election results

The MIT election lab data was more granular than necessary for our analysis, containing state-election year-district-candidate observations with vote totals for each candidate. It was aggregated to the district level, pivoted wider to create state-election year-district observations, and then merged with the redistricting data. Third party candidates were grouped as “OTHER” for the aggregation, and Minnesota’s Democratic party was renamed to match the national party.

elec <- read.csv("./source_data/1976-2020-house.csv")

votes <- elec %>%
  filter(year >= 2002) %>%
  mutate(party = ifelse(party == "REPUBLICAN", "REPUBLICAN",
                 ifelse(party == "DEMOCRAT", "DEMOCRAT",
                 ifelse(str_detect(party, "DEMOCRATIC-FARM"), "DEMOCRAT",
                        "OTHER")))) %>%
  group_by(year, state_po, district, party) %>%
  summarize(votes = sum(candidatevotes), totalvotes = mean(totalvotes)) %>%
  pivot_wider(names_from = party, values_from = votes, values_fill = 0) %>%
  left_join(redistricting, by = c("year"="Year", "state_po"="State")) %>% 
  mutate(dem_margin = 100*(DEMOCRAT-REPUBLICAN)/totalvotes)

write.csv(votes, "votes_redistricting.csv", row.names=FALSE)
votes


Census turnout Data

This data set contained State-election year-voter Keep only for required span - 2000 to 2020 Drop states with only one Congressional district. Then, convert the party variable to have only three categories - Republican, Democrat and Other. Pivot to wide, and get vote totals and shares.

Census data was available for the current census and historical data (separated). Current data is combined, historical data is by presidential/midterms. We import the relevant cells from the Excel sheets and merge them as needed.

## New names:
## New names:
## New names:
## * `` -> `...1`
## * `` -> `...2`
## * `` -> `...3`
## * `` -> `...4`
## * `` -> `...5`
## * `` -> `...6`
## * `` -> `...7`
## * `` -> `...8`
## * `` -> `...9`
## * `` -> `...10`
## * `` -> `...11`

Merge Census and redistricting data

redistricting <- left_join(redistricting, states,
                           by = c("State" = "state_po")) %>%
                 mutate(Cycle.Year = as.numeric(Cycle.Year),
                        state_full = tolower(state_full))

redist_turn <- left_join(redistricting, all_years_long,
                         by = c("state_full" = "turnout_state", "Year" = "turnout_year"))

Download census data using API - state boundaries

https://rconsortium.github.io/censusguide/r-packages-all.html

https://walker-data.com/tidycensus/articles/spatial-data.html

The following code uses the tidycensus package and the Census API to download data + shapefiles from the US Census.

`################

#Declare parameters - API key, variables, time

census_api_key("2f1473c692f61175605ea04cbe2a9a1b41d5bf7c")

yr_select <- c(2010)

#call DF

stpop <- get_decennial(geography = "state", 
                        variables = "P003001", 
                         geometry = TRUE,
                        year = yr_select) %>% select(-variable) %>%
  filter(NAME!="Puerto Rico", NAME!="Alaska", NAME!="Hawaii" )
  
stpop$NAME <- tolower(stpop$NAME) 


###############################

Merge both dataframes

Create a spatial dataframe by merging both datasets.

election_map<-merge(stpop, redist_turn, by.x = "NAME", by.y = "state_full" )

Descriptive/graphical analysis

Graph 1: State redistricting policies

We filter the dataframe to a single year, say, 2012. Then we use ‘ggplot’ with the ‘geom_sf’ function to make a choropleth map, by unordered categorical variable ‘Drawn.by’. This variable includes a string indicating which type of redistricting policy was in effect in a state.

#idvars = c("NAME", "GEOID", "geometry")

# Visualization

##Maps

map2012 <- election_map %>% filter(Year==2012)

## democratic votes

redist_map <- map2012 %>% 
  ggplot() + 
  geom_sf(aes(fill=Drawn.by), color="black", size=0.1) +
    geom_sf_text(aes(label = State), family = "Times", size = 2.6,
               color = "black", alpha = 0.9, hjust = 0.5, vjust = 0.5,
               label.padding = unit(0, "pt")) +
  labs(title = "Redistricting policies in 2012")+
  theme(plot.title =element_text(hjust=0.5, face="bold"), #moves title to the middle of graph, bolds it
          text= element_text(family= "Times", size = )) +
   scale_fill_manual(values = c("#80b1d3", "#ffffb3", "#98FB98", "#fb8072", "#8dd3c7" , "#436EEE"))
redist_map

#ggsave("./images/redist_map_2012.png", redist_map)

Graph 2: Boxplot of average turnout

The following makes a boxplot depicting level of turnout for all elections, categorised by the state’s redistricting policy.

#Filter out single district states, and only till 2021.
redist2 <- redist_turn %>% filter(Drawn.by!="n/a") %>%
  filter(Seats != 1,
         Year < 2022)


# Boxplot

##Make a smaller DF of average turnout by redistricting policy
turnout_by_drawnby <- redist2 %>%
  group_by(Drawn.by) %>%
  summarise(turnout_perc = mean(turnout_perc))

#now make boxplot
#TODO: add coloring same as US map coloring
boxplt<- redist2 %>%
  ggplot(aes(x = Drawn.by, y = turnout_perc)) +
  geom_boxplot() +
    geom_text(data = turnout_by_drawnby,
            aes(x = Drawn.by, y = turnout_perc, label = round(turnout_perc, 0)),
            color = "red",
            size = 3,
            hjust = 0.5,
            vjust = 0.5
          #  position = position_dodge(width=1)
            ) +
    theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip() +
  labs(title = "Avg. Turnout from 2000-2020, by Category",
       x = "",
       y = "Percent of eligible voters who voted") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.title = element_text(face =))

#ggsave("./images/boxplot.png", boxplt)
boxplt

Graph 3: Lineplot of average turnout

a. By state

### By state
election_map %>%
  ggplot(aes(x=Year, y = turnout_perc, colour = State)) + 
  geom_line(size = 0.5) + 
  labs(title = "Dem and Rep voter turnout from 2000-2020", x = "year", y = "percentage of vote by party") + 
  
  theme(plot.title = element_text(hjust = 0.5))

b. By redistricting policy

# Aggregate to institution
inst_lineplt <- election_map %>% group_by(Drawn.by, Year) %>% summarise(avg_turnout = mean(turnout_perc)) %>% filter(Drawn.by!="n/a")

lesscom <- inst_lineplt %>% filter(Drawn.by %in% c("State court", "Federal court", "Legislature", "Independent commission"))

lineplt <- lesscom  %>%
   ggplot(aes(x=Year, y = avg_turnout, colour = Drawn.by)) + 
  geom_line(size = .7) + 
  labs(title = "Avg. Turnout percentage 2000-2020, by redistricting ", x = "year", y = "Turnout percentage") +
  theme(plot.title = element_text(hjust = 0.5))

lineplt

#ggsave("./images/lineplt_trn.html", lineplt)

Regression model: Ordinary Least Squares

We run three regression models with the same basic model.

Model 1: Full-sample, Midterm dummy

Our main independent variable derives from the ‘Institution’ or ‘Drawn.By’ columns. For the first set of OLS results, we run three models.

\(y_{i} = \beta_0 + \beta_1 Policy_{i} + \beta_2 Midterm_{i} + \epsilon_{i}\)

turn_reg_all <- lm(turnout_perc ~ Drawn.by + mid_term, data = redist2)

Model 2: Midterms vs Presidential sample, no midterm dummy

Run the above model without the midterm dummy, separately for midterm elections and presidential elections. We use the ‘mid_term’ indicator variable generated.

\(y_{i} = \beta_0 + \beta_1 Policy_{i} + \epsilon_{i}\)

#Subset to midterms, and estimate the model
redist_mid <- redist2 %>% filter(mid_term==1)
Midterm <- lm(turnout_perc ~ Drawn.by+ mid_term, data = redist_mid)

#Subset to Presidential elections, and estimate the model

redist_pres <- redist2 %>% filter(mid_term==0)
Presidential <- lm(turnout_perc ~ Drawn.by + mid_term, data = redist_pres)

OLS Regression: Results

Estimates table

We use stargazer to generate a formatted table of the regression coefficients for the three estimates models.

stargazer(turn_reg_all, Midterm, Presidential, type ="html")
Dependent variable:
turnout_perc
(1) (2) (3)
Drawn.byFederal court 2.594 2.178 3.096
(1.729) (2.358) (2.601)
Drawn.byIndependent commission -0.099 -0.429 0.321
(1.740) (2.375) (2.613)
Drawn.byLegislature 2.019 1.883 2.253
(1.563) (2.087) (2.399)
Drawn.byPolitician commission -4.606** -6.170** -2.945
(1.879) (2.583) (2.797)
Drawn.byState court 3.147* 3.313 3.078
(1.657) (2.237) (2.516)
mid_term -5.138***
(0.468)
Constant 70.343*** 65.400*** 70.050***
(1.559) (2.042) (2.364)
Observations 430 215 215
R2 0.285 0.132 0.075
Adjusted R2 0.275 0.111 0.053
Residual Std. Error 4.850 (df = 423) 5.002 (df = 209) 4.727 (df = 209)
F Statistic 28.142*** (df = 6; 423) 6.329*** (df = 5; 209) 3.379*** (df = 5; 209)
Note: p<0.1; p<0.05; p<0.01

Graph 4: Coefficient plot

Use models 2 and 3 above to make a coefficient plot using the ‘multiplot’ package. It shows that the coefficient estimates are always more negative or less positive for midterms, compared to Presidential elections.

# Make a coefplot
multiplot(Midterm, Presidential, intercept=FALSE)

Regression model: Panel with fixed effects

Estimating OLS models with panel data may lead to biased and imprecise estimates. The reasons for this:

We use the following panel regression model:

\(y_{it} = \beta_0 + \beta_1 Policy_{it} + \beta_2 state_i + \beta_3 year_t + \epsilon_{it}\)

Where \(y_{it} =\) Turnout percentage for state i in year t \(x_{it} =\) Binary variable for policy (court, legislature~~ or partisan) \(state_i =\) State fixed effects \(year_t =\) Year fixed effects \(\epsilon_{it} =\) Error term

We use the R plm package

We use the three binary variables generated to indicate the state’s redistricting policy

  1. Get dummies for each category (court, legislature, independent etc)

    Make custom variables ‘partisan’=1 or 0, and categorize the above cases into it.

library(plm)
## Warning: package 'plm' was built under R version 4.1.3
## 
## Attaching package: 'plm'
## The following object is masked from 'package:data.table':
## 
##     between
## The following objects are masked from 'package:dplyr':
## 
##     between, lag, lead
plm1 <- plm(turnout_perc ~ partisan + mid_term, data=redist2, index=c("State", "Year"), model="within", effect = "twoways")
plm2 <- plm(turnout_perc ~ legislature + mid_term, data=redist2, index=c("State", "Year"), model="within")
plm3 <- plm(turnout_perc ~ court, data=redist2, index=c("State", "Year"),
            model="within", effect = "twoways")

stargazer(plm1, plm3,  type='text',
add.lines=list(c('State fixed effects', 'Yes','Yes'),  c('Year fixed effects', 'Yes','Yes')))

====================================================== Dependent variable:
—————————- turnout_perc
(1) (2)
—————————————————— partisan -0.969**
(0.491)

court 1.260**
(0.514)


State fixed effects Yes Yes
Year fixed effects Yes Yes
Observations 430 430
R2 0.010 0.016
Adjusted R2 -0.126 -0.120
F Statistic (df = 1; 377) 3.893** 6.010**
====================================================== Note: p<0.1; p<0.05; p<0.01

Part - 2: Analysis of vote shares

Merge Vote share data

This section adds the elections data, and performs basic cleaning/transformation to merge it with the previously made redistricting data frame. This produces a new data frame with district-level observations, and each row containing the number of votes for each candidate (Republican/Democrat/Other), as well as the redistricting method for that election.

Note: Minnesota’s Democratic party is known as the “Democratic-Farmer-Labor” party and required special handling.

elec <- read.csv("./source_data/1976-2020-house.csv")

votes <- elec %>%
  filter(year >= 2002) %>%
  mutate(party = ifelse(party == "REPUBLICAN", "REPUBLICAN",
                 ifelse(party == "DEMOCRAT", "DEMOCRAT",
                 ifelse(str_detect(party, "DEMOCRATIC-FARM"), "DEMOCRAT",
                        "OTHER")))) %>%
  group_by(year, state_po, district, party) %>%
  summarize(votes = sum(candidatevotes), totalvotes = mean(totalvotes)) %>%
  pivot_wider(names_from = party, values_from = votes, values_fill = 0) %>%
  left_join(redist2, by = c("year"="Year", "state_po"="State"))

votes

The visualization filters the results to elections in states with more than one district, and also excludes races where a Democrat or Republican received zero votes (i.e. they were not on the ballot).

votes %>%
  filter(DEMOCRAT > 0, REPUBLICAN > 0, Seats > 1) %>%
  mutate(winner = ifelse(REPUBLICAN > DEMOCRAT, "Republican", "Democrat")) %>%
  ggplot(aes(x = DEMOCRAT, y = REPUBLICAN)) +
  geom_point(alpha = 0.1) +
  scale_y_continuous(labels = label_comma(suffix = "k", scale = 0.001), limits = c(0, 300000)) +
  scale_x_continuous(labels = label_comma(suffix = "k", scale = 0.001), limits = c(0, 300000)) +
  facet_wrap(~Drawn.by)

Proportion of votes and margins

Margins in states

# Dem share

votes <- votes %>% 
  mutate(dem_margin = 100*(DEMOCRAT-REPUBLICAN)/totalvotes)


# Create a new data frame with the avg_margin values grouped by year and Drawn.by

inst_lineplt <- votes %>% group_by(Drawn.by, year) %>% summarise(avg_margin = mean(dem_margin))


inst_lineplt %>% 
   ggplot(aes(x=year, y = avg_margin, colour = Drawn.by)) + 
  geom_line(size = .7) + 
  labs(title = "Vote share margins, by redistricting ", x = "year", y = "Dem Margin in (%)") +
    
  theme(plot.title = element_text(hjust = 0.5))


votes %>% filter(Drawn.by=="Legislature") %>%
  filter(DEMOCRAT > 0, REPUBLICAN > 0, Seats > 1) %>%
  ggplot(aes(x = dem_margin)) +
    geom_histogram(binwidth = 3) +
    geom_vline(xintercept = 0, color = "black") +
    labs(title = "Democratic vote margin (Legislature)",
         x = "Margin (percent)",
         y = "Frequency")